library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)


options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

Load social distancing data

Load the Safegraph social distancing data

# bay_blockgroups <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_blockgroups.rds")

bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date())

# obtaining weekends
weekends <- bay_sd %>% 
  filter(!duplicated(date)) %>% 
  arrange(date) %>% 
  mutate(weekend = ifelse((date %>% as.numeric()) %% 7 %in% c(2,3), T, F)) %>% 
  dplyr::select(date,weekend)

bay_sd <- bay_sd %>% left_join(weekends)

# date of the shelter in place order
shelter_start <- "2020-03-16" %>% as.Date()

# store an average of percent of devices completely at home since the shelter in place order started on weekdays
bay_sd_at_home_average <- bay_sd %>% 
  filter(weekend == F) %>% 
  filter(date > shelter_start) %>%
  group_by(origin_census_block_group) %>% 
  summarize(completely_home_device_count = sum(completely_home_device_count), device_count = sum(device_count)) %>% 
  mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1))

# store average of percent of devices completely at home for January and February on weekdays
bay_pre_sd_at_home_average <- bay_sd %>% 
  filter(weekend == F) %>% 
  filter(date <  as.Date("2020-03-01")) %>%
  group_by(origin_census_block_group) %>% 
  summarize(completely_home_device_count = sum(completely_home_device_count), device_count = sum(device_count)) %>% 
  mutate(`% Completely at Home Pre Shelter` = (completely_home_device_count/device_count*100) %>% round(1))

bay_sd_at_home_average <- bay_sd_at_home_average %>% left_join(bay_pre_sd_at_home_average %>% dplyr::select(origin_census_block_group, `% Completely at Home Pre Shelter`))

I next obtain various demographic data and plot them against social distancing behavior, and examine for correlations.

# obtain the saved census data 
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")

# get FIPS codes for CA counties
bay_area_counties <- lapply(fips("CA", c("Alameda", "Contra Costa", "Marin", "Napa", "San Francisco", "San Mateo", "Santa Clara", "Solano", "Sonoma")), function(x) substr(x,3,5))

# define a function for pulling census data
pullCensus <- function(variableToPull, counties) {
  censusData <- NULL
  for (i in 1:length(counties)) {
    county <- counties[i]
    regionString <- paste0("state:06+county:", county)
    censusDataCounty <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
    censusData <- rbind(censusData, censusDataCounty)
  }
  
  return(censusData)
}

Income

# load in income data - code adapted from other students
bay_median_income_by_block <-
  pullCensus("B19013_001E", bay_area_counties) %>% 
  filter(blockgroup %in% bay_sd$origin_census_block_group) %>%
  rename(
    Median_Income = B19013_001E 
  ) %>% 
  filter(!is.na(Median_Income)) %>% 
  left_join(bay_sd_at_home_average, by = c("blockgroup" = "origin_census_block_group")) %>% 
  filter(!is.na(device_count)) 

bay_ami_by_block <-
  pullCensus("group(B19001)", bay_area_counties) %>%
  dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  filter(blockgroup %in% bay_sd$origin_census_block_group) %>%
  group_by(blockgroup) %>% 
  summarize(
    Total = B19001_001E,
    `Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
    #sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
    `Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E)
  ) %>% 
  mutate(
    `% under 75,000` = `Under 75,000` / Total * 100,
    `% under 100,000` = `Under 100,000` / Total * 100
  ) %>% 
  left_join(bay_median_income_by_block %>% dplyr::select(-Median_Income)
  ) %>% 
  filter(!is.na(device_count))

# plotting
bay_ami_by_block %>% 
  ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $75,000 annually",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Households Below $75,000"
  )

income_75_model <- lm(`% Completely at Home` ~ `% under 75,000`, bay_ami_by_block)
summary(income_75_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% under 75,000`, data = bay_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.615  -4.965   0.576   5.531  25.000 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      52.615083   0.281849  186.68   <2e-16 ***
## `% under 75,000` -0.152449   0.006425  -23.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.538 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1064, Adjusted R-squared:  0.1062 
## F-statistic:   563 on 1 and 4728 DF,  p-value: < 2.2e-16
# income - less than $100000
bay_ami_by_block %>% 
  ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $100,000 annually",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Households Below $100,000"
  )

income_100_model <- lm(`% Completely at Home` ~ `% under 100,000`, bay_ami_by_block)
summary(income_100_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% under 100,000`, data = bay_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.543  -4.765   0.590   5.545  23.900 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       54.542971   0.326103  167.26   <2e-16 ***
## `% under 100,000` -0.155785   0.005934  -26.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.438 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1272, Adjusted R-squared:  0.127 
## F-statistic: 689.2 on 1 and 4728 DF,  p-value: < 2.2e-16

Compare to pre-shelter-in-place behavior:

bay_ami_by_block %>% 
  ggplot(aes(
  x = `% under 75,000`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $75,000 annually",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Households Below $75,000 Pre Shelter-in-Place"
  )

income_75_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 75,000`, bay_ami_by_block)
summary(income_75_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 75,000`, 
##     data = bay_ami_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.435  -3.290  -0.245   2.942  30.095 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      17.730991   0.165171  107.35   <2e-16 ***
## `% under 75,000`  0.113036   0.003765   30.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.003 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1601, Adjusted R-squared:  0.1599 
## F-statistic: 901.3 on 1 and 4728 DF,  p-value: < 2.2e-16
# income - less than $100000
bay_ami_by_block %>% 
  ggplot(aes(
  x = `% under 100,000`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) +
  labs(
    x = "Percent of housholds with incomes under $100,000 annually",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Households Below $100,000 Pre Shelter-in-Place"
  )

income_100_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% under 100,000`, bay_ami_by_block)
summary(income_100_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% under 100,000`, 
##     data = bay_ami_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.9262  -3.3078  -0.3057   2.9109  30.2334 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       16.639978   0.192226   86.56   <2e-16 ***
## `% under 100,000`  0.108863   0.003498   31.12   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.974 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:   0.17,  Adjusted R-squared:  0.1699 
## F-statistic: 968.6 on 1 and 4728 DF,  p-value: < 2.2e-16

Language

# loading in language data - code adapted from other students
bay_lang_by_block <-
  pullCensus("group(B16004)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  left_join(acs_vars, by = c("variable" = "name")) %>% 
  mutate(
    tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
  ) %>% 
  filter(tier %in% c('Speak English "not well"', 
                     'Speak English "not at all"', 
                     'Total', 'Speak Spanish', 
                     'Speak Asian and Pacific Island languages')) %>% 
  group_by(blockgroup, tier) %>% 
  summarise(
    estimate1 = sum(estimate)
  ) %>% 
  spread(
    key = "tier",
    value = "estimate1"
  ) %>% 
  mutate(
    `% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
    `% speaking spanish` = (`Speak Spanish`/ Total) * 100,
    `% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
  ) %>% 
  left_join(bay_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) %>% 
  mutate(log_perc = log(`% speaking english < well`))

# plotting
bay_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english < well`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English less than well",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and English Language Ability"
  )

english_ability_model <- lm(`% Completely at Home` ~ `% speaking english < well`, bay_lang_by_block)
summary(english_ability_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking english < well`, 
##     data = bay_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.028  -5.520   0.373   5.825  28.695 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 46.633533   0.177956 262.052   <2e-16 ***
## `% speaking english < well` -0.007336   0.015542  -0.472    0.637    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.081 on 4734 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  4.706e-05,  Adjusted R-squared:  -0.0001642 
## F-statistic: 0.2228 on 1 and 4734 DF,  p-value: 0.6369
bay_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking spanish`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking Spanish",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Spanish Language Ability"
  )

spanish_speaking_model <- lm(`% Completely at Home` ~ `% speaking spanish`, bay_lang_by_block)
summary(spanish_speaking_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `% speaking spanish`, data = bay_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.864  -5.124   0.483   5.715  28.442 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          48.457820   0.174164   278.2   <2e-16 ***
## `% speaking spanish` -0.117490   0.007343   -16.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.845 on 4734 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.05131,    Adjusted R-squared:  0.05111 
## F-statistic:   256 on 1 and 4734 DF,  p-value: < 2.2e-16

Compare to pre-shelter-in-place behavior:

bay_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking english < well`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking English less than well",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and English Language Ability Pre Shelter-in-Place"
  )

english_ability_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking english < well`, bay_lang_by_block)
summary(english_ability_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking english < well`, 
##     data = bay_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.914  -3.676  -0.362   3.136  33.866 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 20.855961   0.103496  201.51   <2e-16 ***
## `% speaking english < well`  0.173148   0.009037   19.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.279 on 4732 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.07199,    Adjusted R-squared:  0.0718 
## F-statistic: 367.1 on 1 and 4732 DF,  p-value: < 2.2e-16
bay_lang_by_block %>% 
  ggplot(aes(
  x = `% speaking spanish`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of individuals speaking Spanish",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Spanish Language Ability Pre Shelter-in-Place"
  )

spanish_speaking_model2 <- lm(`% Completely at Home Pre Shelter` ~ `% speaking spanish`, bay_lang_by_block)
summary(spanish_speaking_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `% speaking spanish`, 
##     data = bay_lang_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.869  -3.603  -0.413   3.159  32.848 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          20.853569   0.104039  200.44   <2e-16 ***
## `% speaking spanish`  0.083275   0.004386   18.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.283 on 4732 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.0708, Adjusted R-squared:  0.07061 
## F-statistic: 360.6 on 1 and 4732 DF,  p-value: < 2.2e-16

Age

# loading in age data - specifically looking at percentage 65+ and percentage <30
bay_age_by_block <- 
  pullCensus("group(B01001)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(
    key = "variable",
    value = "estimate", 
    - blockgroup
  ) %>% 
  mutate(
    label = acs_vars$label[match(variable,acs_vars$name)]
  ) %>% 
  select(-variable) %>% 
  separate(
    label,
    into = c(NA,NA,"sex","age"),
    sep = "!!"
  ) %>% filter(!is.na(age)) %>% 
  mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>% 
  group_by(blockgroup) %>% 
  summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>% 
  mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total) %>% 
  left_join(bay_median_income_by_block %>% dplyr::select(-Median_Income)) %>% 
  filter(!is.na(device_count)) 

# plotting
bay_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Young Age Groups"
  )

young_model <- lm(bay_age_by_block$`% Completely at Home` ~ bay_age_by_block$`percent less than 30`)
summary(young_model)
## 
## Call:
## lm(formula = bay_age_by_block$`% Completely at Home` ~ bay_age_by_block$`percent less than 30`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.096  -5.313   0.292   5.739  30.226 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             50.85035    0.46160 110.161   <2e-16
## bay_age_by_block$`percent less than 30` -0.12054    0.01249  -9.652   <2e-16
##                                            
## (Intercept)                             ***
## bay_age_by_block$`percent less than 30` ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.993 on 4734 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.0193, Adjusted R-squared:  0.01909 
## F-statistic: 93.16 on 1 and 4734 DF,  p-value: < 2.2e-16
bay_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents ages 65 and older",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Elderly Population"
  )

elderly_model <- lm(`% Completely at Home` ~ `percent elderly`, bay_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent elderly`, data = bay_age_by_block %>% 
##     filter(`percent elderly` < 50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.229  -5.505   0.352   5.766  29.349 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       45.79736    0.28066 163.178  < 2e-16 ***
## `percent elderly`  0.05361    0.01621   3.308 0.000946 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.05 on 4693 degrees of freedom
## Multiple R-squared:  0.002327,   Adjusted R-squared:  0.002114 
## F-statistic: 10.94 on 1 and 4693 DF,  p-value: 0.0009461

Compare to pre-shelter-in-place behavior:

bay_age_by_block %>%
  ggplot(aes(
  x = `percent less than 30`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
labs(
    x = "Percent of residents younger than 30",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Young Age Groups Pre Shelter-in-Place"
  )

young_model2 <- lm(bay_age_by_block$`% Completely at Home Pre Shelter` ~ bay_age_by_block$`percent less than 30`)
summary(young_model2)
## 
## Call:
## lm(formula = bay_age_by_block$`% Completely at Home Pre Shelter` ~ 
##     bay_age_by_block$`percent less than 30`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.296  -3.749  -0.283   3.332  33.649 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             21.226471   0.281877  75.304  < 2e-16
## bay_age_by_block$`percent less than 30`  0.027096   0.007631   3.551 0.000388
##                                            
## (Intercept)                             ***
## bay_age_by_block$`percent less than 30` ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.473 on 4732 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.002657,   Adjusted R-squared:  0.002447 
## F-statistic: 12.61 on 1 and 4732 DF,  p-value: 0.0003878
bay_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
  ggplot(aes(
  x = `percent elderly`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of residents ages 65 and older",
    y = "Percent devices completely at home onw eekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Elderly Population Pre Shelter-in-Place"
  )

elderly_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent elderly`, bay_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent elderly`, 
##     data = bay_age_by_block %>% filter(`percent elderly` < 50))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.810  -3.681  -0.331   3.316  32.714 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       23.453206   0.167597 139.938   <2e-16 ***
## `percent elderly` -0.085606   0.009675  -8.848   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.399 on 4691 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.01642,    Adjusted R-squared:  0.01621 
## F-statistic: 78.29 on 1 and 4691 DF,  p-value: < 2.2e-16

Vehicles available

# also get data on vehicles available as households without a vehicle
bay_no_vehicles_by_block <- pullCensus("group(B25044)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>%
  separate(label, into = c(NA, NA, NA,"vehicles"), sep = "!!") %>% 
  filter(!is.na(vehicles)) %>%
  group_by(blockgroup, vehicles) %>%
  summarize(grouped_vehicles = sum(estimate)) %>%
  spread(key = vehicles, value = grouped_vehicles) %>%
  mutate(total_nums = `1 vehicle available` + `2 vehicles available` + `3 vehicles available` + `4 vehicles available` + `5 or more vehicles available` + `No vehicle available`, `percent no vehicles` = `No vehicle available`*100 / total_nums, `log percent` = log(`percent no vehicles`)) %>%
  left_join(bay_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count))

# plotting
bay_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent no vehicles`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with no vehicle available",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Vehicle Availability"
  )

vehicles_model <- lm(`% Completely at Home` ~ `percent no vehicles`, bay_no_vehicles_by_block)
summary(vehicles_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent no vehicles`, 
##     data = bay_no_vehicles_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.150  -5.519   0.323   5.838  28.323 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           46.97701    0.15808 297.166  < 2e-16 ***
## `percent no vehicles` -0.04342    0.01048  -4.143 3.49e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.016 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.003617,   Adjusted R-squared:  0.003406 
## F-statistic: 17.16 on 1 and 4728 DF,  p-value: 3.491e-05

Compare to pre-shelter-in-place behavior:

bay_no_vehicles_by_block %>% 
  ggplot(aes(
  x = `percent no vehicles`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of housholds with no vehicle available",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Vehicle Availability Pre Shelter-in-Place"
  )

vehicles_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent no vehicles`, bay_no_vehicles_by_block)
summary(vehicles_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent no vehicles`, 
##     data = bay_no_vehicles_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.1954  -3.5354  -0.2145   3.2767  30.3199 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           21.180091   0.092106  229.95   <2e-16 ***
## `percent no vehicles`  0.118945   0.006106   19.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.253 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.07429,    Adjusted R-squared:  0.0741 
## F-statistic: 379.5 on 1 and 4728 DF,  p-value: < 2.2e-16

Occupants per room

# get data on occupants per room
bay_occupants_per_room_by_block <- pullCensus("group(B25014)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>% 
  filter(!is.na(`occupants per room`)) %>%
  group_by(blockgroup, `occupants per room`) %>%
  summarize(estimate_tot = sum(estimate)) %>% 
  spread(key = `occupants per room`, value = estimate_tot) %>%
  mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums) %>%
  left_join(bay_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count)) 

# plotting
bay_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent 1 or more`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with more than 1 occupant per room",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Room Occupancy"
  )

occupants_model <- lm(`% Completely at Home` ~ `percent 1 or more`, bay_occupants_per_room_by_block)
summary(occupants_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent 1 or more`, data = bay_occupants_per_room_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.191  -5.440   0.355   5.746  28.102 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         47.19777    0.16630 283.812  < 2e-16 ***
## `percent 1 or more` -0.08743    0.01529  -5.719 1.14e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.001 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.00687,    Adjusted R-squared:  0.00666 
## F-statistic: 32.71 on 1 and 4728 DF,  p-value: 1.138e-08

Compare to pre-shelter-in-place behavior:

bay_occupants_per_room_by_block %>% 
  ggplot(aes(
  x = `percent 1 or more`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households with more than 1 occupant per room",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Room Occupancy Pre Shelter-in-Place"
  )

occupants_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent 1 or more`, bay_occupants_per_room_by_block)
summary(occupants_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent 1 or more`, 
##     data = bay_occupants_per_room_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.116  -3.671  -0.316   3.161  30.361 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         21.138916   0.097823   216.1   <2e-16 ***
## `percent 1 or more`  0.155550   0.008993    17.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.295 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.05951,    Adjusted R-squared:  0.05931 
## F-statistic: 299.2 on 1 and 4728 DF,  p-value: < 2.2e-16

Education

bay_education_by_block <- pullCensus("group(B15003)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "education level"), sep = "!!") %>% 
  mutate(`education level` = replace_na(`education level`, "total_educ")) %>% # if the education level field is NA, this corresponded to the total number in that blockgroup
  spread(key = `education level`, value = estimate) %>%
  mutate(`percent associates or higher` = (`Associate's degree` + `Bachelor's degree` + `Doctorate degree` + `Master's degree`)*100/total_educ, `percent less than associates` = 100-`percent associates or higher`) %>% 
  left_join(bay_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count)) 

# plotting
bay_education_by_block %>% 
  ggplot(aes(
  x = `percent less than associates`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people without an degree at Associate's level or higher",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and Education"
  )

educ_model <- lm(`% Completely at Home` ~ `percent less than associates`, bay_education_by_block)
summary(educ_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent less than associates`, 
##     data = bay_education_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.727  -4.859   0.715   5.691  26.864 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    53.47890    0.33405  160.09   <2e-16 ***
## `percent less than associates` -0.13917    0.00625  -22.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.628 on 4733 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.09482,    Adjusted R-squared:  0.09463 
## F-statistic: 495.8 on 1 and 4733 DF,  p-value: < 2.2e-16

Compare to pre-shelter-in-place behavior:

bay_education_by_block %>% 
  ggplot(aes(
  x = `percent less than associates`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of people without an degree at Associate's level or higher",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and Education Pre Shelter-in-Place"
  )

educ_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent less than associates`, bay_education_by_block)
summary(educ_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent less than associates`, 
##     data = bay_education_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3499  -3.4893  -0.4311   3.0081  31.2636 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    17.595763   0.199664   88.13   <2e-16 ***
## `percent less than associates`  0.092690   0.003737   24.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.155 on 4732 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.1151, Adjusted R-squared:  0.1149 
## F-statistic: 615.3 on 1 and 4732 DF,  p-value: < 2.2e-16

High speed internet access

Motivated by this paper https://www.nber.org/papers/w26982.pdf on social distancing, internet access, and inequality, we look at whether a household has “Broadband (high-speed) Internet service such as cable, fiber optic, or DSL service,” and staying at home.

bay_internet_by_block <- pullCensus("group(B28002)", bay_area_counties) %>%
  select(-c(contains("EA"),contains("MA"),contains("M"))) %>% 
  gather(key = "variable", value = "estimate", -blockgroup) %>% 
  mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>% 
  select(-variable) %>% 
  separate(label, into = c(NA, NA, "subscription", "type", "additional"), sep = "!!") %>% 
  filter(is.na(subscription) | (type == "Broadband such as cable, fiber optic or DSL") & is.na(additional)) %>% 
  mutate(type = replace_na(type, "total_num")) %>% 
  dplyr::select(blockgroup, type, estimate) %>%
  spread(key = type, value = estimate) %>%
  mutate(`percent high speed` = `Broadband such as cable, fiber optic or DSL`*100/total_num, `percent no high speed` = 100-`percent high speed`) %>% 
  left_join(bay_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
  filter(!is.na(device_count)) 

# plotting
bay_internet_by_block %>% 
  ggplot(aes(
  x = `percent no high speed`,
  y = `% Completely at Home`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households without broadband such as cable, fiber optic or DSL",
    y = "Percent devices completely at home on weekdays since shelter-in-place",
    title = "Bay Area: Social Distancing and High Speed Internet"
  )

internet_model <- lm(`% Completely at Home` ~ `percent no high speed`, bay_internet_by_block)
summary(internet_model)
## 
## Call:
## lm(formula = `% Completely at Home` ~ `percent no high speed`, 
##     data = bay_internet_by_block)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.141  -5.086   0.442   5.622  28.112 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             50.656273   0.228034  222.14   <2e-16 ***
## `percent no high speed` -0.196780   0.009262  -21.25   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.63 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.08715,    Adjusted R-squared:  0.08695 
## F-statistic: 451.4 on 1 and 4728 DF,  p-value: < 2.2e-16

Compare to pre-shelter-in-place behavior:

bay_internet_by_block %>% 
  ggplot(aes(
  x = `percent no high speed`,
  y = `% Completely at Home Pre Shelter`
)) + geom_point() + 
  geom_smooth(method=lm) + 
  labs(
    x = "Percent of households without broadband such as cable, fiber optic or DSL",
    y = "Percent devices completely at home on weekdays pre-shelter-in-place",
    title = "Bay Area: Staying at Home and High Speed Internet Pre Shelter-in-Place"
  )

internet_model2 <- lm(`% Completely at Home Pre Shelter` ~ `percent no high speed`, bay_internet_by_block)
summary(internet_model2)
## 
## Call:
## lm(formula = `% Completely at Home Pre Shelter` ~ `percent no high speed`, 
##     data = bay_internet_by_block)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.8729  -3.5344  -0.1585   3.1107  29.3893 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             19.52334    0.13663  142.89   <2e-16 ***
## `percent no high speed`  0.12937    0.00555   23.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.171 on 4728 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1031, Adjusted R-squared:  0.1029 
## F-statistic: 543.4 on 1 and 4728 DF,  p-value: < 2.2e-16

Multiple regression analysis

Multiple regression analysis with income, education, and internet

# multiple regression 
modeltest <- lm(bay_ami_by_block$`% Completely at Home` ~ bay_ami_by_block$`% under 100,000` + bay_education_by_block$`percent less than associates` + bay_internet_by_block$`percent no high speed`)
summary(modeltest)
## 
## Call:
## lm(formula = bay_ami_by_block$`% Completely at Home` ~ bay_ami_by_block$`% under 100,000` + 
##     bay_education_by_block$`percent less than associates` + bay_internet_by_block$`percent no high speed`)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.421  -4.717   0.644   5.622  25.069 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                           55.091416   0.350698
## bay_ami_by_block$`% under 100,000`                    -0.102925   0.008964
## bay_education_by_block$`percent less than associates` -0.038937   0.008728
## bay_internet_by_block$`percent no high speed`         -0.063835   0.012003
##                                                       t value Pr(>|t|)    
## (Intercept)                                           157.091  < 2e-16 ***
## bay_ami_by_block$`% under 100,000`                    -11.481  < 2e-16 ***
## bay_education_by_block$`percent less than associates`  -4.461 8.34e-06 ***
## bay_internet_by_block$`percent no high speed`          -5.318 1.09e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.384 on 4726 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.1387, Adjusted R-squared:  0.1382 
## F-statistic: 253.8 on 3 and 4726 DF,  p-value: < 2.2e-16